# topic 21 # First, set up the situation. We have two populations # with a known standard deviations. # source("../pop_sd.R") source("../gnrnd5.R") source("../gnrnd4.R") gnrnd5( 162353499902, 873001383) pop_one <- L1 # I can tell you that I know the standard deviation # of pop_one is 16.7252 sigma_one = 16.7252 gnrnd5( 123353499902, 1073001283) pop_two <- L1 # I can tell you that I know the standard deviation # of pop_two is 20.81073 sigma_two <- 20.81073 ################################################ ### Confidence Interval ### ################################################ # We want to construct a 95% confidence interval # for the difference of the means, mu_one - mu_two. # To do this we will take random samples of both, # the size of the sample from pop_one will be 28 # and the size of the sample from pop_two will # be 34. n1 <- 28 n2 <- 34 # for now, we will get the samples by generating the # appropriate random index numbers for the two # populations. # get the sample and sample statistics from pop_one gnrnd4( 547892701, 500000001) L1 index_one <- L1 samp_one <- pop_one[ index_one ] samp_one x_bar_one <- mean( samp_one ) x_bar_one # get the sample and sample statistics from pop_two gnrnd4( 763893301, 500000001) L1 index_two <- L1 samp_two <- pop_two[ index_two ] samp_two x_bar_two <- mean( samp_two ) x_bar_two #### Therefore we can find the difference of the sample means diff_of_means <- x_bar_one - x_bar_two diff_of_means #### This is an instance where we know the standard deviation #### of the underlying populations. Therefore, the distribution #### of the difference of the sample means will be N( mu, sigma) #### where mu is mu_one - mu_two, and sigma is #### sqrt( sigma_one^2/n1 + sigma_two^2/n2) for samples #### of size n1 and n2, respectively. ## we took our samples, (see above) so we can compute ## sd_of_diff <- sqrt( sigma_one^2/n1 +sigma_two^2/n2 ) sd_of_diff # for a 95% confidence interval, we are missing 5% total, # and we need to split this in half, a 2.5% for the # top and bottom. # # get the z value, from a N(0.1), with 2.5% above it z <- qnorm( 0.025, lower.tail=FALSE) z # then our margin of error is MOE <- z*sd_of_diff MOE # so our confidence interval goes from diff_of_means - MOE # to diff_of_means + MOE # That pattern of commands would be the same for each # instance of such a problem. We have a function # that does this. It is called ci_2known(). source("../ci_2known.R") ci_2known(sigma_one, 28, x_bar_one, sigma_two, 34, x_bar_two, 0.95) ################################################ ### Hypothesis test ### ################################################ # Now, someone says that they believe that the # difference of the means from pop_one and pop_two # mu_1 - mu_2 = 0, that the means are the same. # Thus our null hypothesis, H0, is that the difference is 0. # Our alternative hypothesis, H1, is that the # difference of the the means is greater than 0, or # that mu_one > mu_two. # We want to see if we can reject H0 in favor of H1. # We will get a sample from each # population, n1=28 and n2=34. # We are willing to be wrong in telling the # person that they are wrong 2.5% of the time! ######################## ## The critical value approach. We know that two ## samples of size 28 and 34, respectively, will have ## a standard error of the difference of the means ## be sqrt( sigma_one^2/28 + sigma_two^2/35 ) # #### we computed this above, but let us do it again ### sd_of_diff <- sqrt( sigma_one^2/28 + sigma_two^2/34 ) sd_of_diff ## Furthermore, the difference of the means will have a ## Normal distribution. z_val <- qnorm( 0.025, lower.tail=FALSE ) z_val # Because the null hypothesis is that the difference is 0 # The critical high value will be 0+z_val*sd_of_diff or z_val*sd_of_diff # our critical high value # we compare the difference of the means to that critical value diff_of_means # that difference is greater than our critical high. Therefore, # we would reject H0 in favor of H1. ###################### ## The attained significance approach asks, if H0 is true, ## then how strange would it be to get a difference of the ## means at this value or at any stranger value. But ## that is just a pnorm command pnorm( diff_of_means, mean=0, sd=sd_of_diff, lower.tail=FALSE) # # That attained significance is less than our given # level of significance, 0.025, so we would reject H0 # in favor of H1. ######################### ## Of course, for any similar problem we would be doing ## the same steps, with the additional complication of ## having to pay attention to the alternative. If H1 is ## mu_one < mu_two, which is the same as mu_one-mu_two<0, ## then we would get a critical low value and our ## pnorm would use the lower tail. If, H1 is ## mu_one != mu_two, which is the same as ## mu_one-mu_two != 0, then we would need to get both a ## critical low and a critical high, splitting the area ## of significance between the low and high. In the ## same situation, once we get our attained value on ## one side, high or low, we would need to double it to ## account for values that are just as extreme or more ## extreme on the other side. ## ## Rather than think out all of these issues each time ## we run into this problem, we might as well capture ## all of this in a function, hypoth_2test_known(). ## LEt us load and run that for the values in our example. source( "../hypo_2known.R") hypoth_2test_known( sigma_one, n1, x_bar_one, sigma_two, n2, x_bar_two, 1, 0.025 ) ###################################################### ###################################################### ## Of course, we really could have found the means ## of the two populations. mu_one <- mean( pop_one ) mu_one mu_two <- mean( pop_two ) mu_two real_diff <- mu_one - mu_two real_diff # we will run through 10000 cases where we # get samples of size 28 and 34 and we will # use those samples,specifically the sample # means, to generate a 95% confidence interval # for the difference of the population means. # In the process we will see how many of those # intervals contain the true difference. L1 <- 1:10000 for( i in 1:10000 ) { samp_one <- sample( pop_one, 28 ) samp_two <- sample( pop_two, 34 ) this_interval <- ci_2known( sigma_one, 28, mean( samp_one), sigma_two, 34, mean( samp_two), 0.95 ) if( this_interval[1]